Data Analysis Using R

Functions

# create UpSet plots for AST measurements and fluoroquinolone resistance mechanisms
upset_plot <- function(df, x, y) {
# df - dataframe; x - Flq resistance mechanism column; y - Laboratory typing method column
  require(dplyr, ggplot2, ggupset)
  
  # checks if lab typing method is MIC or disk diffusion 
  if (grepl('MIC.*$', df[y])) {
    df %>%
      group_by(strain, Measurement) %>% 
      # !! sym() allows one to pass a column name as an argument
      summarize(res_mech_list = list(!! sym(x))) %>%
      ggplot(aes(res_mech_list, Measurement)) + geom_violin() + geom_count() + 
      # EUCAST cut-off of 0.5 mg/L
      geom_hline(aes(yintercept = 0.5, linetype = "EUCAST"), alpha = 0.6, color = "black") +
      # CLSI cut-off of 1 mg/L
      geom_hline(aes(yintercept = 1.0, linetype = "CLSI"), color = "black") +
      labs(x = "Fluoroquinolone resistance mutations", y = "Ciprofloxacin MIC measurement (mg/L)",
           size = "Number of strains", linetype = "MIC breakpoints") +
      # convert y-axis to log2 scale and labels to 2 dp
      scale_y_continuous(trans = log2_trans(), breaks = 2^(-5:9), labels = function(x) round(as.numeric(x), digits = 2)) +
      scale_linetype_manual(values = c("dotted", "solid")) +
      scale_x_upset() +
      theme_combmatrix(combmatrix.panel.line.size = 0.8)
  } else {
      df %>%
        group_by(strain, Measurement) %>% 
        summarize(res_mech_list = list(!! sym(x))) %>%
        ggplot(aes(res_mech_list, Measurement)) + geom_violin() + geom_count() + 
        # EUCAST cut-off of 22mm
        geom_hline(aes(yintercept = 22, linetype = "EUCAST"), alpha = 0.6, color = "black") +
        # CLSI cut-off of 21mm
        geom_hline(aes(yintercept = 21, linetype = "CLSI"), color = "black") +
        labs(title = "Disk diffusion measurements", x = 
               "Fluoroquinolone resistance mutations", y = "Ciprofloxacin disk diffusion measurement (mm)", 
             size = "Number of strains", linetype = "Disk diffusion breakpoints") +
        scale_linetype_manual(values = c("dotted", "solid")) +
        scale_x_upset() +
        theme_combmatrix(combmatrix.panel.line.size = 0.8)
  }
}

Load required libraries

library(here)
library(tidyverse)
library(scales)
library(ggupset)
library(RColorBrewer)
library(collapse)
library(DT)

Load antibiogram data and filter for ciprofloxacin

# read antibiogram data
antibiogram <- read.delim(file = "antibiogram_combined.tsv", 
                                header = TRUE, sep = "\t")

# filter for ciprofloxacin
cipro_antibiogram <- antibiogram %>%
  filter(Antibiotic == "ciprofloxacin") %>%
  select(strain:ST, Flq_mutations, Flq_acquired, Omp_mutations, Antibiotic:Measurement.Round) %>%
  mutate(Resistance.phenotype = case_when(grepl('MIC.*$', Laboratory.Typing.Method) & Measurement <= 0.25 ~ 
                                            "susceptible",
                                          grepl('MIC.*$', Laboratory.Typing.Method) & Testing.standard == 
                                            "EUCAST" & Measurement > 0.5 ~ "resistant",
                                          grepl('MIC.*$', Laboratory.Typing.Method) & Testing.standard == 
                                            "CLSI" & Measurement >= 1 ~ "resistant",
                                          Laboratory.Typing.Method == "Disk diffusion" & Testing.standard == 
                                            "EUCAST" & Measurement >= 25 ~ "susceptible",
                                          Laboratory.Typing.Method == "Disk diffusion" & Testing.standard == 
                                            "EUCAST" & Measurement < 22 ~ "resistant",
                                          Laboratory.Typing.Method == "Disk diffusion" & Testing.standard == 
                                            "CLSI" & Measurement >= 26 ~ "susceptible",
                                          Laboratory.Typing.Method == "Disk diffusion" & Testing.standard == 
                                            "CLSI" & Measurement <= 21 ~ "resistant",
                                          TRUE ~ "resistant"))

#write_delim(cipro_antibiogram, file = "cipro_antibiogram.tsv", delim = "\t")

# combine antibiogram with mutations not in kleborate data file
no_mut_kleborate <- read.delim(file = "mutations_not_in_kleborate.tsv",
                               header = TRUE, sep = "\t")

cipro_antibiogram <- cipro_antibiogram %>%
  left_join(no_mut_kleborate, by = c("strain")) %>%
  mutate(Flq_mutations = case_when(Flq_mutations.y %in% NA ~ Flq_mutations.x, 
                                   Flq_mutations.x == "-" & Flq_mutations.y %in% NA ~ Flq_mutations.x,
                                   TRUE ~ Flq_mutations.y)) %>%
  select(- c(Flq_mutations.x, Flq_mutations.y)) %>%
  relocate(Flq_mutations, .after = ST)

Summaries

Sample count per dataset

cipro_antibiogram %>%
  group_by(index) %>%
  ggplot(aes(x = index)) +
  geom_bar(fill = "#ff8000", alpha = 0.6) +
  geom_text(aes(label = ..count..), stat = "count", nudge_y = 60) +
  labs(title = "Total sample count per dataset", y = "Total sample count", fill = "Lab typing method") +
  theme(axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

Total sample counts vs measurements for different lab typing methods

Disk diffusion

cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method == "Disk diffusion") %>%
  ggplot(aes(Measurement, fill = index)) + geom_bar() + 
  # EUCAST cut-off of 22mm
  geom_vline(aes(xintercept = 22, linetype = "EUCAST"), color = "black") +
  # CLSI cut-off of 21mm
  geom_vline(aes(xintercept = 21, linetype = "CLSI"), color = "black") +
  labs(title = "Disk Diffusion Measurements", x = "Measurement (mm)", y = "Frequency", fill = "Dataset", 
       linetype = "Disk diffusion \nbreakpoints") +
  scale_linetype_manual(values = c("dotted", "solid"))

MIC (broth dilution)

cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method == "MIC (broth dilution)") %>%
  ggplot(aes(x = factor(Measurement), fill = index)) + geom_bar() + 
  # EUCAST cut-off of 0.5 mg/L
  geom_vline(aes(xintercept = 6, linetype = "EUCAST"), alpha = 0.6, color = "black") +
  # CLSI cut-off of 1 mg/L
  geom_vline(aes(xintercept = 8, linetype = "CLSI"), color = "black") +
  labs(title = "MIC Broth Dilution Measurements", x = "Measurement (mg/L)", y = "Frequency", fill = "Dataset", 
       linetype = "MIC breakpoints") +
  scale_linetype_manual(values = c("dotted", "solid")) 

MIC (agar dilution)

cipro_antibiogram %>%
  group_by(Laboratory.Typing.Method) %>%
  filter(Laboratory.Typing.Method == "MIC (agar dilution)") %>%
  ggplot(aes(x = factor(Measurement), fill = index)) + geom_bar() + 
  # EUCAST cut-off of 0.5 mg/L
  geom_vline(aes(xintercept = 5, linetype = "EUCAST"), alpha = 0.6, color = "black") +
  # CLSI cut-off of 1 mg/L
  geom_vline(aes(xintercept = 6, linetype = "CLSI"), color = "black") +
  labs(title = "MIC Agar Dilution Measurements", x = "Measurement (mg/L)", y = "Frequency", fill = "Dataset", 
       linetype = "MIC breakpoints") +
  # round off x-axis labels to 2dp
  scale_x_discrete(labels = function(x) round(as.numeric(x), digits = 2)) +
  scale_linetype_manual(values = c("dotted", "solid")) 

UpSet plots for different lab typing methods to visualise resistance mechanism combinations

# combine Flq_mutations, Flq_acquired and Omp_mutations into one column
cipro_res_mech <- cipro_antibiogram %>%
  pivot_longer(Flq_mutations:Omp_mutations, names_to = "Flq.resistance.mech", 
               values_to = "Resistance.mech.type") %>%
  separate(Resistance.mech.type, into = c("type_A", "type_B", "type_C"), sep = "[;]")

cipro_res_mech <- cipro_res_mech %>%
  pivot_longer(cols = names(cipro_res_mech)[26:28], values_to = "Resistance.mech.type") %>%
  drop_na(Resistance.mech.type) %>%
  select(-name) 

rare_res_qnr_genes <- c("qnrB7", "qnrE2", "qnrS2", "qnrVC1", "qnrVC6")


# now label all OmpK35 truncations as such, e.g. "OmpK35-21%" -> "OmpK35-trunc"
# as these all reflect the same mechanism (i.e. truncation/loss of the porin)
cipro_res_mech <- cipro_res_mech %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, grepl("OmpK35-", Resistance.mech.type), "OmpK35-trunc")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, grepl("OmpK36-", Resistance.mech.type), "OmpK36-trunc")) %>%
  # based on statistical analysis (chunk prop_test), combine QRDR mutations to have just 4 types (GyrA-83, GyrA-87, ParC-80, ParC-84)
  # except GyrA-87H which is found in one strain and is sensitive
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "GyrA-83"), "GyrA-83")) %>%
  mutate(Resistance.mech.type = case_when(startsWith(Resistance.mech.type, "GyrA-87") & Resistance.mech.type != "GyrA-87H" ~ "GyrA-87",
                                          TRUE ~ Resistance.mech.type)) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "ParC-80"), "ParC-80")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "ParC-84"), "ParC-84")) %>%
  # also combine the following acquired genes
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qepA2"), "qepA2")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qnrA1"), "qnrA1")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qnrB1"), "qnrB1")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qnrB19"), "qnrB19")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qnrB2"), "qnrB2")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, startsWith(Resistance.mech.type, "qnrS1"), "qnrS1")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, grepl(paste(rare_res_qnr_genes, collapse = "|"), Resistance.mech.type), "rare_res_qnr"))

# explicitly label strains with wildtype QRDR (gyrA/parC), wt porin (Omp), and no acquired genes as such
cipro_res_mech <- cipro_res_mech %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, Flq.resistance.mech=="Flq_mutations" & Resistance.mech.type=="-", "wt_QRDR")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, Flq.resistance.mech=="Omp_mutations" & Resistance.mech.type=="-", "wt_Omp")) %>%
  mutate(Resistance.mech.type = replace(Resistance.mech.type, Flq.resistance.mech=="Flq_acquired" & Resistance.mech.type=="-", "no_acquired_genes"))

Summary bar plot of resistance mechanisms

Count table

# count number of occurrences of the different fluoroquinolone resistance determinants
summ_res_mech <- cipro_res_mech %>%
  filter(Resistance.mech.type != "wt_Omp" & Resistance.mech.type != "wt_QRDR" & 
           Resistance.mech.type != "no_acquired_genes") %>%
  group_by(Flq.resistance.mech, Resistance.mech.type) %>%
  summarise(count = n()) 

DT::datatable(summ_res_mech,
              width = "50%",
              extensions = "FixedHeader",
              rownames = FALSE,
              options = list(paging=TRUE, fixedHeader=TRUE, scrollX=TRUE))

Summary bar plot

summ_res_mech %>%
  ggplot(aes(Resistance.mech.type, count, fill = Flq.resistance.mech)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count), stat = "identity", nudge_y = 60) +
  labs(title = "Total sample count per resistance type", x = "Resistance mechanism",
       y = "Total sample count", fill = "Fluoroquinolone resistance \nmechanism type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  scale_fill_brewer(palette = "Accent")

UpSet plots

MIC (broth dilution)

cipro_res_mech %>%
  filter(!(Resistance.mech.type == "no_acquired_genes")) %>% # exclude these rows as not informative
  filter(Laboratory.Typing.Method == "MIC (broth dilution)") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method") +
  labs(title = "MIC (broth dilution) measurements") 

MIC (agar dilution)

cipro_res_mech %>%
  filter(!(Resistance.mech.type == "no_acquired_genes")) %>% # exclude these rows as not informative
  filter(Laboratory.Typing.Method == "MIC (agar dilution)") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method") +
  labs(title = "MIC (agar dilution) measurements") 

Disk diffusion

cipro_res_mech %>%
  filter(!(Resistance.mech.type == "no_acquired_genes")) %>% # exclude these rows as not informative
  filter(Laboratory.Typing.Method == "Disk diffusion") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method")

ST analysis

Resistant strains displaying fluoroquinolone resistance mechanisms

ST count

# count STs with resistant phenotype
top_35_st_res <- cipro_antibiogram %>%
  filter(Resistance.phenotype == "resistant") %>%
  filter(Flq_mutations != "-" | Flq_acquired != "-" | Omp_mutations != "-") %>%
  group_by(ST) %>%
  summarise(sample_count = n()) %>%
  as.data.frame() %>%
  arrange(desc(sample_count)) %>%
  head(35) %>%
  mutate(ST = factor(ST, levels = ST, ordered = TRUE))

DT::datatable(top_35_st_res, 
              width = "50%",
              extensions = "FixedHeader",
              rownames = FALSE,
              options = list(paging=TRUE, fixedHeader=TRUE, scrollX=TRUE))

Summary bar plot

# Blues palette with 9 colors
colour_palette <- brewer.pal(9, "Blues") 

# Add more colors to this palette :
colour_palette <- colorRampPalette(colour_palette)(35)

top_35_st_res %>%
  ggplot(aes(y = ST, x = sample_count, fill = ST, label = sample_count)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(hjust = -0.5) +
  scale_y_discrete(limits = rev) +
  ggtitle("Top STs with fluoroquinolone resistance") +
  labs(x = "Sample count") +
  # reverse order of colour palette
  scale_fill_manual(values = rev(colour_palette)) +
  theme_minimal() +
  theme(legend.position = "none",
  axis.text = element_text(size = 10)) +
  coord_cartesian(clip = "off")

Resistant STs associated with specific fluoroquinolone resistance mechanisms

Proportion table
st_totals <- cipro_antibiogram %>%
  group_by(ST) %>%
  summarise(N=n())

st_res_mech <- cipro_res_mech %>%
  filter(Resistance.phenotype == "resistant") %>%
  filter(Resistance.mech.type != "wt_QRDR", Resistance.mech.type != "wt_Omp",
         Resistance.mech.type != "no_acquired_genes") %>%
  count(ST, Resistance.mech.type) %>% 
  left_join(st_totals) %>% # ST total counts
  mutate(prop = n/N) %>% # proportion per ST
  mutate(se = sqrt(prop*(1-prop)/N)) %>%
  mutate(p_lowerCI = prop-1.96*se, p_upperCI = prop+1.96*se) %>%
  mutate(p_lowerCI = replace(p_lowerCI, p_lowerCI<0, 0)) %>%
  mutate(p_upperCI = replace(p_upperCI, p_upperCI>1, 1)) %>%
  select(-se) %>%
  mutate(ST = factor(ST, levels = unique(ST)))

DT::datatable(st_res_mech, 
              extensions = "FixedHeader",
              rownames = FALSE,
              options = list(paging=TRUE, fixedHeader=TRUE, scrollX=TRUE))
Heatmap of frequencies of resistance determinants per ST
st_res_mech %>%
  filter(ST %in% st_res_mech$ST[st_res_mech$prop>0.1 & st_res_mech$N>50]) %>%
  ggplot(aes(x=ST, y = Resistance.mech.type)) +
  geom_tile(aes(fill=prop)) +
  labs(subtitle = "Fluoroquinolone resistance mechanisms in top resistant STs", 
       y = "Fluoroquinolone resistance mechanisms", fill = "Proportion per \nST") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8, colour = "black")) + 
  scale_fill_gradient(low = "white", high = "red")

Heatmap/UpSet plot showing %frequency of resistance determinants
st_res_mech %>%
  filter(ST %in% st_res_mech$ST[st_res_mech$prop>0.1 & st_res_mech$N>50]) %>%
  select(- c("p_lowerCI", "p_upperCI")) %>%
  group_by(ST, prop) %>% 
  summarize(res_mech_list = list(Resistance.mech.type)) %>%
  ggplot(aes(res_mech_list, ST, fill = prop)) + geom_tile() + 
  labs(title = "Proportion of fluoroquinolone resistance determinants per ST", 
       x = "Fluoroquinolone resistance determinants", y = "ST", fill = "Proportion per\nST") +
  scale_x_upset() +
  theme_combmatrix(combmatrix.panel.line.size = 0.8) + 
  scale_fill_gradient(low = "white", high = "red")

Resistant strains displaying no fluoroquinolone resistance mechanisms

Count table

no_res_mech <- cipro_antibiogram %>%
  filter(Flq_mutations == "-", Flq_acquired == "-", Omp_mutations == "-") %>%
  filter(Resistance.phenotype == "resistant")

st_no_res_mech <- no_res_mech %>%
  group_by(ST) %>%
  summarise(sample_count = n()) %>%
  as.data.frame() %>%
  arrange(desc(sample_count)) %>%
  mutate(ST = factor(ST, levels = ST, ordered = TRUE))

DT::datatable(st_no_res_mech,
              width = "50%",
              extensions = "FixedHeader",
              rownames = FALSE,
              options = list(paging=TRUE, fixedHeader=TRUE, scrollX=TRUE))

Bar plot

st_no_res_mech %>%
  filter(sample_count > 2) %>%
  ggplot(aes(ST, sample_count)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
  labs(title = "Resistant STs with no fluoroquinolone resistance mechanisms", y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Susceptible strains displaying fluoroquinolone resistance mechanisms

Count table

sus_res_mech <- cipro_res_mech %>%
  filter(Resistance.phenotype == "susceptible") %>%
  filter(Resistance.mech.type != "wt_QRDR", Resistance.mech.type != "wt_Omp",
         Resistance.mech.type != "no_acquired_genes")

st_sus_res_mech <- sus_res_mech %>%
  group_by(ST) %>%
  summarise(res_mech = Resistance.mech.type) %>%
  as.data.frame() %>%
  count(ST, res_mech, sort = TRUE) %>%
  mutate(ST = factor(ST, levels = unique(ST)))

DT::datatable(st_sus_res_mech,
              width = "50%",
              extensions = "FixedHeader",
              rownames = FALSE,
              options = list(paging=TRUE, fixedHeader=TRUE, scrollX=TRUE))

Bar plot

st_sus_res_mech %>%
  filter(n != 1) %>%
  ggplot(aes(ST, n, fill = res_mech)) +
  geom_bar(stat = "identity") +
  labs(title = "Fluoroquinolone resistance mechanisms employed by the top susceptible STs", y = "Count", fill = "Fluoroquinolone resistance\nmechanisms") +
  # put legend in one column
  guides(fill = guide_legend(ncol = 1)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

UpSet plots

Disk diffusion
sus_res_mech %>%
  filter(Laboratory.Typing.Method == "Disk diffusion") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method")

MIC (broth dilution)
sus_res_mech %>%
  filter(Laboratory.Typing.Method == "MIC (broth dilution)") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method") +
  labs(title = "MIC (broth dilution) measurements") 

MIC (agar dilution)
sus_res_mech %>%
  filter(Laboratory.Typing.Method == "MIC (agar dilution)") %>%
  upset_plot("Resistance.mech.type", "Laboratory.Typing.Method") +
  labs(title = "MIC (agar dilution) measurements") 

Statistical analysis

Proportion test

# calculate frequencies of different resistance mechanisms for the different phenotypes
# including numbers for the wildtype groups for comparison to compare resistance frequency 
# amongst strains with the determinant vs strains without
mech_freq <- cipro_res_mech %>%
  group_by(Resistance.mech.type, Resistance.phenotype) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(Resistance.mech.type) %>%
  pivot_wider(names_from = Resistance.phenotype, values_from = n) 

# convert na values to 0
mech_freq[is.na(mech_freq)] <- 0

# find totals of different resistance mechanisms
mech_freq <- mech_freq %>%
  mutate(total = sum(c(resistant, susceptible)))


# compare each gyrA/parC SNP vs wildtype QRDR
qrdr_wt <- mech_freq %>% filter(Resistance.mech.type == "wt_QRDR")
qrdr_mut <- mech_freq %>% 
  filter(startsWith(Resistance.mech.type, "GyrA") | startsWith(Resistance.mech.type, "ParC")) %>%
  mutate(wt_res = qrdr_wt[2], wt_total=qrdr_wt[4]) # add columns with the wildtype counts for res & total
  
# run prop.test() to each row, comparing the resistant counts (x) with the total counts (n)
qrdr_prop_out <- dapply(as.matrix(qrdr_mut), MARGIN = 1, 
       FUN = function(y) broom::tidy(prop.test(x=c(as.numeric(y[2]), as.numeric(y[5])), n=c(as.numeric(y[4]), as.numeric(y[6])))))

# combine qrdr_mut and qrdr_prop_out
qrdr_mut_freq_prop <- cbind(qrdr_mut, qrdr_prop_out) %>%
  select(resistant, total, estimate1, estimate2, p.value) %>%
  mutate(p.value=as.numeric(p.value))

qrdr_mut_freq_prop

# specific mutations observed more than once, significantly associated with elevated resistance (vs wildtype QRDR)
view(qrdr_mut_freq_prop %>% filter(total>1) %>% filter(p.value < 0.05))

# the data provides evidence that ANY mutation at the 4 codons we track is associated with resistance, even though for some individual mutations we only have one observation:
# check gyrA83, there are 5 mutations observed at this codon (A, F, I, L, Y)
# 4 have large numbers (≥10) and significant p-values, gyrA-83A is observed only once but is also resistant
qrdr_mut_freq_prop %>% filter(startsWith(Resistance.mech.type,"GyrA-83"))

# Similar for GyrA-87, except that GyrA87-H is only seen in one strain and it is sensitive, so exclude this one (not a borderline phenotype)
qrdr_mut_freq_prop %>% filter(startsWith(Resistance.mech.type,"GyrA-87"))

# ParC-80 and ParC-84 look the same as for GyrA-83, ie all mutations at the site have same effect can be grouped together
qrdr_mut_freq_prop %>% filter(startsWith(Resistance.mech.type,"ParC-80"))
qrdr_mut_freq_prop %>% filter(startsWith(Resistance.mech.type,"ParC-84"))

# based on the statistical analysis, QRDR mutations will be updated to have just 4 types 
# (GyrA-83, GyrA-87, ParC-80, ParC-84) instead of 19 most of which are rare


# compare each Omp mutations vs wildtype Omp
omp_wt <- mech_freq %>% filter(Resistance.mech.type == "wt_Omp")
omp_mut <- mech_freq %>% 
  filter(startsWith(Resistance.mech.type, "Omp")) %>%
  mutate(wt_res = omp_wt[2], wt_total=omp_wt[4]) # add columns with the wildtype counts for res & total

# run prop.test() to each row, comparing the resistant counts (x) with the total counts (n)
omp_prop_out <- dapply(as.matrix(omp_mut), MARGIN = 1, 
       FUN = function(y) broom::tidy(prop.test(x=c(as.numeric(y[2]), as.numeric(y[5])), n=c(as.numeric(y[4]), as.numeric(y[6])))))

omp_mut_freq_prop <- cbind(omp_mut, omp_prop_out) %>% 
  select(resistant, total, estimate1, estimate2, p.value) %>%
  mutate(p.value=as.numeric(p.value))

# all four mutations (truncation of either ompK35 or ompK36; and insertion of GD or TD in OmpK36, are all significantly associated with resistance)
view(omp_mut_freq_prop)


# compare acquired vs no acquired genes
no_acquired <- mech_freq %>% filter(Resistance.mech.type == "no_acquired_genes")
acquired_genes <- mech_freq %>% 
  filter(startsWith(Resistance.mech.type, "qnr") | startsWith(Resistance.mech.type, "qep") ) %>%
  mutate(wt_res = no_acquired[2], wt_total=no_acquired[4]) # add columns with the wildtype counts for res & total

# run prop.test() to each row, comparing the resistant counts (x) with the total counts (n)
acquired_genes_prop_out <- dapply(as.matrix(acquired_genes), MARGIN = 1, 
       FUN = function(y) broom::tidy(prop.test(x=c(as.numeric(y[2]), as.numeric(y[5])), n=c(as.numeric(y[4]), as.numeric(y[6])))))

acquired_genes_freq_prop <- cbind(acquired_genes, acquired_genes_prop_out) %>% 
  select(resistant, total, estimate1, estimate2, p.value) %>%
  mutate(p.value=as.numeric(p.value))

view(acquired_genes_freq_prop)

# from the statistical analysis, the following can be combined:
# qepA2 (n=1, which is resistant) with qepA2* (n=17, all of which are resistant)
# qnrA1 and qnrA1^ (note the ^ means they are identical at protein level anyway) as all are resistant
# qnrB1.v1, qnrB1.v2, qnrB1.v2^ for similar reasons
# qnrB19, qnrB19^
# qnrB2.1, qnrB2.2
# qnrS1, qnrS1?, qnrS1*
# could also combine all the 'rare' genes that are only seen 1-2 times each but are always resistant:
# -> qnrB7, qnrE2, qnrS2, qnrVC1, qnrVC6

Session information

sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.26.1          collapse_1.8.9     RColorBrewer_1.1-3 ggupset_0.3.0     
##  [5] scales_1.2.1       forcats_0.5.2      stringr_1.4.1      dplyr_1.0.10      
##  [9] purrr_0.3.5        readr_2.1.3        tidyr_1.2.1        tibble_3.1.8      
## [13] ggplot2_3.4.0      tidyverse_1.3.2    here_1.0.1        
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.9          lubridate_1.9.0     assertthat_0.2.1   
##  [4] rprojroot_2.0.3     digest_0.6.30       utf8_1.2.2         
##  [7] R6_2.5.1            cellranger_1.1.0    backports_1.4.1    
## [10] reprex_2.0.2        evaluate_0.18       highr_0.9          
## [13] httr_1.4.4          pillar_1.8.1        rlang_1.0.6        
## [16] googlesheets4_1.0.1 readxl_1.4.1        rstudioapi_0.14    
## [19] jquerylib_0.1.4     rmarkdown_2.18      labeling_0.4.2     
## [22] googledrive_2.0.0   htmlwidgets_1.5.4   munsell_0.5.0      
## [25] broom_1.0.1         compiler_4.2.2      modelr_0.1.10      
## [28] xfun_0.35           pkgconfig_2.0.3     htmltools_0.5.3    
## [31] tidyselect_1.1.2    fansi_1.0.3         crayon_1.5.2       
## [34] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0        
## [37] grid_4.2.2          jsonlite_1.8.3      gtable_0.3.0       
## [40] lifecycle_1.0.3     DBI_1.1.3           magrittr_2.0.3     
## [43] cli_3.4.1           stringi_1.7.8       cachem_1.0.6       
## [46] farver_2.1.1        fs_1.5.2            xml2_1.3.3         
## [49] bslib_0.4.1         ellipsis_0.3.2      generics_0.1.2     
## [52] vctrs_0.5.1         tools_4.2.2         glue_1.6.2         
## [55] crosstalk_1.2.0     hms_1.1.2           parallel_4.2.2     
## [58] fastmap_1.1.0       yaml_2.3.6          timechange_0.1.1   
## [61] colorspace_2.0-3    gargle_1.2.1        rvest_1.0.3        
## [64] knitr_1.41          haven_2.5.1         sass_0.4.3